{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "一共有 6742 封邮件。\n",
      "\n",
      "第1封邮件未清洗前的内容为: \n",
      " B6\n",
      "Thursday, March 3, 2011 9:45 PM\n",
      "H: Latest How Syria is aiding Qaddafi and more... Sid\n",
      "hrc memo syria aiding libya 030311.docx; hrc memo syria aiding libya 030311.docx\n",
      "March 3, 2011\n",
      "For: Hillary \n",
      "\n",
      "第1封邮件去除停用词并处理成gensim需要的格式为：\n",
      " ['latest', 'syria', 'aiding', 'qaddafi', 'sid', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'hillary'] \n",
      "\n"
     ]
    }
   ],
   "source": [
    "#!/usr/bin/python\n",
    "# -*- coding:utf-8 -*-\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "from gensim import corpora, models, similarities\n",
    "import gensim\n",
    "\n",
    "\"\"\"第一步：用正则表达式清洗数据，并去除停用词\"\"\"\n",
    "df = pd.read_csv(\"HillaryEmails.csv\")\n",
    "# 原邮件数据中有很多Nan的值，直接扔了。\n",
    "df = df[['Id','ExtractedBodyText']].dropna()\n",
    "\n",
    "# 用正则表达式清洗数据\n",
    "def clean_email_text(text):\n",
    "    text = text.replace('\\n',\" \")                        # 新行，我们是不需要的\n",
    "    text = re.sub(r\"-\", \" \", text)                       # 把 \"-\" 的两个单词，分开。（比如：july-edu ==> july edu）\n",
    "    text = re.sub(r\"\\d+/\\d+/\\d+\", \"\", text)              # 日期，对主体模型没什么意义\n",
    "    text = re.sub(r\"[0-2]?[0-9]:[0-6][0-9]\", \"\", text)   # 时间，没意义\n",
    "    text = re.sub(r\"[\\w]+@[\\.\\w]+\", \"\", text)            # 邮件地址，没意义\n",
    "    text = re.sub(r\"/[a-zA-Z]*[:\\//\\]*[A-Za-z0-9\\-_]+\\.+[A-Za-z0-9\\.\\/%&=\\?\\-_]+/i\", \"\", text)    # 网址，没意义\n",
    "    \n",
    "    # 以防还有其他除了单词以外的特殊字符（数字）等等，我们把特殊字符过滤掉\n",
    "    # 只留下字母和空格\n",
    "    # 再把单个字母去掉，留下单词\n",
    "    pure_text = ''\n",
    "    for letter in text:\n",
    "        if letter.isalpha() or letter==' ':\n",
    "            pure_text += letter\n",
    "            \n",
    "    text = ' '.join(word for word in pure_text.split() if len(word)>1)\n",
    "    return text\n",
    "\n",
    "docs_text = df['ExtractedBodyText']\n",
    "docs = docs_text.apply(lambda s: clean_email_text(s))  \n",
    "\n",
    "# 得到所有邮件的内容\n",
    "doclist = docs.values\n",
    "print(\"一共有\",len(doclist),\"封邮件。\\n\")\n",
    "print(\"第1封邮件未清洗前的内容为: \\n\",docs_text.iloc[0],'\\n')\n",
    "\n",
    "# 去除停用词，处理成gensim需要的输入格式\n",
    "stopwords = [word.strip() for word in open('./stopwords.txt','r').readlines()]\n",
    "# 每一封邮件都有星期和月份，这里也把他们过滤掉\n",
    "weeks = ['monday','mon','tuesday','tues','wednesday','wed','thursday','thur','friday','fri','saturday','sat','sunday','sun']\n",
    "months = ['jan','january','feb','february','mar','march','apr','april','may','jun','june','jul',\\\n",
    "          'july','aug','august','sept','september','oct','october','nov','november','dec','december']\n",
    "stoplist = stopwords+weeks+months+['am','pm']\n",
    "texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in doclist]\n",
    "\n",
    "texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in doclist]\n",
    "print(\"第1封邮件去除停用词并处理成gensim需要的格式为：\\n\",texts[0],'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第1封邮件ID化后的结果为：\n",
      " [(0, 3), (1, 2), (2, 1), (3, 2), (4, 1), (5, 2), (6, 2), (7, 1), (8, 1), (9, 3)] \n",
      "\n",
      "0.010*\"would\" + 0.006*\"obama\" + 0.006*\"one\" + 0.006*\"president\" + 0.005*\"israel\" + 0.005*\"people\" + 0.005*\"us\" + 0.004*\"percent\" + 0.004*\"said\" + 0.004*\"state\"\n",
      "[(0, '0.013*\"senate\" + 0.011*\"yes\" + 0.009*\"bloomberg\" + 0.005*\"state\" + 0.004*\"week\" + 0.004*\"house\" + 0.004*\"bill\" + 0.004*\"see\" + 0.004*\"get\" + 0.004*\"next\"'), (1, '0.029*\"call\" + 0.025*\"ok\" + 0.015*\"thx\" + 0.015*\"see\" + 0.014*\"im\" + 0.011*\"part\" + 0.010*\"talk\" + 0.010*\"release\" + 0.008*\"qddr\" + 0.008*\"tomorrow\"'), (2, '0.008*\"email\" + 0.007*\"negotiating\" + 0.006*\"fyi\" + 0.004*\"eu\" + 0.004*\"would\" + 0.004*\"good\" + 0.004*\"message\" + 0.004*\"back\" + 0.003*\"us\" + 0.003*\"call\"'), (3, '0.032*\"pls\" + 0.030*\"print\" + 0.009*\"pis\" + 0.009*\"copy\" + 0.008*\"sbwhoeop\" + 0.007*\"sid\" + 0.006*\"fw\" + 0.005*\"website\" + 0.005*\"ashton\" + 0.005*\"sent\"'), (4, '0.010*\"would\" + 0.007*\"one\" + 0.007*\"could\" + 0.007*\"time\" + 0.006*\"like\" + 0.006*\"know\" + 0.005*\"mr\" + 0.005*\"much\" + 0.005*\"good\" + 0.005*\"also\"'), (5, '0.035*\"office\" + 0.029*\"fyi\" + 0.028*\"secretarys\" + 0.020*\"meeting\" + 0.019*\"room\" + 0.014*\"state\" + 0.012*\"department\" + 0.012*\"arrive\" + 0.012*\"en\" + 0.011*\"route\"'), (6, '0.008*\"call\" + 0.005*\"know\" + 0.005*\"today\" + 0.005*\"us\" + 0.005*\"get\" + 0.004*\"state\" + 0.004*\"said\" + 0.004*\"list\" + 0.004*\"well\" + 0.003*\"working\"'), (7, '0.010*\"us\" + 0.007*\"state\" + 0.006*\"new\" + 0.005*\"said\" + 0.005*\"government\" + 0.004*\"american\" + 0.004*\"security\" + 0.004*\"would\" + 0.004*\"states\" + 0.004*\"military\"'), (8, '0.008*\"us\" + 0.006*\"new\" + 0.005*\"people\" + 0.004*\"un\" + 0.004*\"also\" + 0.004*\"american\" + 0.004*\"said\" + 0.004*\"work\" + 0.004*\"one\" + 0.003*\"state\"'), (9, '0.010*\"would\" + 0.006*\"obama\" + 0.006*\"one\" + 0.006*\"president\" + 0.005*\"israel\" + 0.005*\"people\" + 0.005*\"us\" + 0.004*\"percent\" + 0.004*\"said\" + 0.004*\"state\"')]\n"
     ]
    }
   ],
   "source": [
    "\"\"\"第二步：构建字典，将文本ID化\"\"\"\n",
    "dictionary = corpora.Dictionary(texts)\n",
    "corpus = [dictionary.doc2bow(text) for text in texts]\n",
    "# 将每一篇邮件ID化\n",
    "print(\"第1封邮件ID化后的结果为：\\n\",corpus[0],'\\n')\n",
    "\n",
    "\"\"\"第三步：训练LDA模型\"\"\"\n",
    "lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)\n",
    "# 第10个主题的单词分布，取权重最高的前10个词\n",
    "print(lda.print_topic(9, topn=10))\n",
    "# 所有主题的单词分布\n",
    "print(lda.print_topics(num_topics=10, num_words=10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第1封邮件的大致内容为：\n",
      " ['latest', 'syria', 'aiding', 'qaddafi', 'sid', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'hrc', 'memo', 'syria', 'aiding', 'libya', 'docx', 'hillary'] \n",
      "\n",
      "第1封邮件的主题分布为：\n",
      " [(3, 0.56100214), (8, 0.3968486)] \n",
      "\n"
     ]
    }
   ],
   "source": [
    "\"\"\"第四步：查看某封邮件所属的主题\"\"\"\n",
    "print(\"第1封邮件的大致内容为：\\n\",texts[0],'\\n')\n",
    "topic = lda.get_document_topics(corpus[0])\n",
    "print(\"第1封邮件的主题分布为：\\n\",topic,'\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "这两条推特的主题分布分别为：\n",
      " [(0, 0.0111170085), (1, 0.011118207), (2, 0.01111913), (3, 0.011116115), (4, 0.89994085), (5, 0.011116263), (6, 0.011116605), (7, 0.011117295), (8, 0.01111973), (9, 0.0111187985)] \n",
      " [(4, 0.9181052)]\n"
     ]
    }
   ],
   "source": [
    "# 希拉里发的两条推特\n",
    "# 给大伙翻译一下这两句：\n",
    "# 这是选举的一天!数以百万计的美国人投了希拉里的票。加入他们吧，确定你投给谁。\n",
    "# 希望今天每个人都能度过一个安乐的感恩节，和家人朋友共度美好时光——来自希拉里的问候。\n",
    "\n",
    "twitter = [\"It's Election Day! Millions of Americans have cast their votes for Hillary—join them and confirm where you vote \",\n",
    "       \"Hoping everyone has a safe & Happy Thanksgiving today, & quality time with family & friends. -H\"]\n",
    "\n",
    "text_twitter = [clean_email_text(s) for s in twitter]\n",
    "text_twitter = [[word for word in text.lower().split() if word not in stoplist] for text in text_twitter]\n",
    "corpus_twitter = [dictionary.doc2bow(text) for text in text_twitter]\n",
    "topics_twitter = lda.get_document_topics(corpus_twitter)\n",
    "print(\"这两条推特的主题分布分别为：\\n\",topics_twitter[0] ,'\\n',topics_twitter[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
